## Replication code for "Family Matters?"
## This is the main merge script (it won't run without non-provided data; see readme file for details)
### Summer 2018
### See the readme file for more details on what goes where in this replication package
### Contact Ariel White with questions: arwhi@mit.edu

# Clear every object from the global environment before the merge starts.
rm(list=ls())
# Every relative file name passed to load()/save()/pdf() below is resolved against this directory.
setwd("/nfs/projects_nobackup/f/FLvote/Texas/merging")

library(data.table)
library(sp) # Set CRS
library(spdep) # Spatial statistics and definitions
library(rgeos) # buffer and polygon analysis
library(maptools)
library(maps)
library(rgdal)

# Defendant records arrive in two geocoded extracts; the code reads an object
# named geodef12flat after each load().
load("Harrisdefendants20002012_Arcgeocoded.Rdata")
harrismatch1 <- geodef12flat
dim(harrismatch1)

load("Harrisdefendants20122014_Arcgeocoded.Rdata")
# Filing year 2012 is dropped from the second extract to avoid duplicates.
update <- geodef12flat[geodef12flat$fyear != 2012, ]
dim(update)
dim(geodef12flat)

# Stack the two extracts into one defendant table.
harrismatch <- rbind(harrismatch1, update)
dim(harrismatch)

#trim down to Harris county addresses? using zipcode http://www.zillow.com/browse/homes/tx/harris-county/
Harriscozips <- c(63362, 77002, 77004, 77003, 77006, 77005, 77008, 77007, 77010, 77009, 77012, 77011, 75032, 77014, 77013, 77016, 63383, 77015, 77018, 77017, 77020, 77019, 77022, 77021, 77024, 77023, 77026, 77025, 77028, 77027, 77030, 77029, 77032, 77031, 77034, 77033, 77036, 77035, 77038, 77037, 77040, 77039, 77042, 77041, 77044, 77043, 77046, 77045, 77048, 77047, 77050, 77049, 77051, 77054, 77053, 77056, 77055, 77058, 28056, 77057, 77060, 77059, 77062, 77061, 77064, 77063, 77066, 77065, 77068, 77067, 77070, 77069, 76048, 77072, 77071, 77074, 77073, 77076, 77075, 77078, 77077, 77080, 77079, 77082, 77081, 77084, 77083, 77086, 77085, 77088, 77087, 77090, 75134, 77089, 77092, 77091, 77094, 78108, 77093, 77096, 77095, 77098, 77099, 77204, 63627, 75160, 77217, 77238, 77249, 77255, 75229, 77266, 77268, 76226, 77306, 77318, 77316, 77325, 76247, 77334, 78266, 77336, 77339, 77338, 77345, 76270, 77346, 77354, 33935, 77356, 77355, 77357, 77362, 77365, 77373, 77375, 77377, 77379, 77381, 77380, 77383, 77382, 77385, 80498, 77384, 77386, 77389, 77388, 77391, 77396, 77401, 76437, 77406, 77410, 77423, 77429, 77433, 77441, 76472, 77445, 77447, 77450, 77449, 77459, 76513, 77469, 77471, 77478, 77477, 77479, 77482, 77484, 77489, 77493, 77494, 77503, 77502, 77505, 77504, 36862, 77507, 77506, 77510, 77514, 77521, 77520, 77530, 77532, 77531, 77523, 77407, 77536, 77498, 77535, 77539, 79708, 77546, 77545, 77547, 77554, 77562, 77571, 77573, 77578, 77581, 77584, 77583, 77586, 77587, 77590, 77598, 77650, 77663, 75758, 78669, 79938, 77845, 78734)
dim(harrismatch)
# Keep only addresses whose geocoded zip is on the Harriscozips list
# (this should also trim out most non-matches for now).
harrismatch <- harrismatch[harrismatch$ARC_ZIP %in% Harriscozips, ]
harrismatch <- harrismatch[harrismatch$Score > 48, ] #keep match score cutoff pretty low for now
harrismatch <- harrismatch[harrismatch$crt < 16, ] # keep only misdemeanants
dim(harrismatch)

#clean up/subset as needed.
colnames(harrismatch)[colnames(harrismatch) == "coords.x2"] <- "Latitude"
colnames(harrismatch)[colnames(harrismatch) == "coords.x1"] <- "Longitude"
# Parse the yyyymmdd filing date into a Date. The original computed this same
# column twice (before and after the renames); once is sufficient because the
# renames do not touch firstcased.
harrismatch$firstcase_date <- as.Date(as.character(harrismatch$firstcased), format = "%Y%m%d")

#give defendants coordinates
# Keep a plain copy first: coordinates<- below converts harrismatch into an sp spatial object.
harrismatch_unproj <- harrismatch #keep one non-projected
coordinates(harrismatch) <- c("Longitude", "Latitude")

# Declare the coordinates as unprojected WGS84 longitude/latitude.
proj4string(harrismatch) <- CRS(" +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0") #see earlier file: when I pulled the geocoded voters in as a shapefile from Arc, this is the projection it had.
#switch to meters
# Reproject to a UTM CRS whose units are meters (+units=m), so buffer widths used later are in meters.
# NOTE(review): the target is UTM zone 14; Harris County lies near 95.4 W, which would normally be zone 15 -- confirm the zone choice is intentional.
harrismatchproj <- spTransform(harrismatch,CRS(" +proj=utm +zone=14 +ellps=WGS84 +datum=WGS84 +units=m +no_defs +towgs84=0,0,0")) 

#voters
library(data.table)
load("Harrisvoters2013_Arcgeocoded.Rdata")

# Keep only voter rows whose geocoding Status is "M" (matched to an address).
harrisproj <- fullfile[fullfile$Status == "M", ]
dim(fullfile)
dim(harrisproj)

# Rename the Y/X columns to the coordinate names used for the defendants.
setnames(harrisproj, "Y", "Latitude")
setnames(harrisproj, "X", "Longitude")
head(harrisproj$Longitude)
harrisdf <- harrisproj #keep one plain copy

rm(fullfile)

# Turn the voters into a spatial object, declare WGS84 long/lat, then
# reproject into the same UTM CRS that the defendants were projected to.
coordinates(harrisproj) <- c("Longitude", "Latitude")
proj4string(harrisproj) <- CRS(" +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
harrisproj <- spTransform(
	harrisproj,
	CRS(" +proj=utm +zone=14 +ellps=WGS84 +datum=WGS84 +units=m +no_defs +towgs84=0,0,0")
)

#just restrict to zip codes and then look within those (for efficiency)
# Character copies of the geocoded zip are what the per-zip search compares on.
harrisproj$ARCZIPCHAR <- as.character(harrisproj$ARC_ZIP)
harrismatchproj$ARCZIPCHAR <- as.character(harrismatchproj$ARC_ZIP)

voterlistlist <- list() #create an empty list (that will contain other lists)

# Defendants sorted by zip, and the distinct zips that will be iterated over.
harrismatchproj1 <- harrismatchproj[order(harrismatchproj$ARCZIPCHAR), ]
uniquezips <- unique(harrismatchproj1$ARCZIPCHAR)

# keep only the necessary columns and merge in the rest later for speed
keep <- c("coordinates", "coords.x1", "coords.x2", "state_file_id", "ARCZIPCHAR")
harrisproj1 <- harrisproj[, names(harrisproj) %in% keep]

#set up function to replace the inner loop (where we go through all the defendants in the zipcode)
# buffer(): find the voters that fall inside a width-3 buffer around one defendant.
#   i             - row index into smallerdefs
#   smallerdefs   - spatial object of defendants (the caller restricts it to one zip)
#   smallervoters - spatial object of voters (same zip)
# Returns a data.frame of the voters inside the buffer with two extra columns:
# distances (from spDistsN1) and def_spn (the defendant's id). Returns NULL
# when no voter is inside the buffer.
buffer <- function(i, smallerdefs, smallervoters){
	defendant <- smallerdefs[i,]
	# single-point buffer around the defendant; width is in the units of the object's CRS
	halo <- gBuffer(defendant, width=3, byid=TRUE)
	inside <- which(gContains(halo, smallervoters, byid=TRUE))
	voters.inside <- as.data.frame(smallervoters[inside,])
	if(nrow(voters.inside) < 1){
		return(invisible(NULL)) #nobody in the buffer
	}
	# voters located through their coords.x1/coords.x2 columns
	voter_pts <- voters.inside
	coordinates(voter_pts) <- c("coords.x1", "coords.x2")
	# the defendant located through its X/Y columns
	def_pt <- as.data.frame(defendant)
	coordinates(def_pt) <- c("X", "Y") #just use lat/long for this calc.
	voters.inside$distances <- spDistsN1(voter_pts, def_pt) #distance from each voter to the defendant
	voters.inside$def_spn <- defendant$def_spn
	voters.inside
}


library(doParallel)
# Start 12 worker processes and register them as the backend used by %dopar%.
cl <- makeCluster(12)
registerDoParallel(cl)
voterlistlist  <- list() #create an empty list (that will contain other lists)
# Wall-clock start; the elapsed time is printed after the results are combined.
strt <- Sys.time()

# outerloop(): run the buffer search for every defendant in one zip code.
#   j                - index into uniquezips
#   harrisproj1      - voters, with an ARCZIPCHAR column
#   harrismatchproj1 - defendants, with an ARCZIPCHAR column
#   uniquezips       - character vector of zip codes
# Returns a list with one element per defendant in the zip: whatever buffer()
# returned for that defendant (a data.frame of nearby voters, or NULL).
# The original also built an unused data.frame (`over`); it has been removed,
# and seq_len() replaces 1:nrow() so an empty zip yields an empty list.
outerloop <- function(j, harrisproj1, harrismatchproj1, uniquezips){
	zip <- uniquezips[j]
	smallervoters <- harrisproj1[harrisproj1$ARCZIPCHAR==zip,] #limit to voters in the same zip code before searching.
	smallerdefs <- harrismatchproj1[harrismatchproj1$ARCZIPCHAR==zip,] #same deal for defendants.
	voterlist <- foreach(i=seq_len(nrow(smallerdefs))) %do% buffer(i, smallerdefs, smallervoters)
	return(voterlist) #the whole voter list from this zipcode goes into the bigger list of lists.
}

# Run outerloop() for every zip code in parallel; the result is a list (one
# element per zip) of lists (one element per defendant).
voterlistlist <- foreach(j=seq_along(uniquezips), .packages=c('rgeos', 'sp', 'spdep', 'rgdal', 'foreach')) %dopar% (
	outerloop(j, harrisproj1, harrismatchproj1, uniquezips)
)
# Shut the workers down now that the parallel step is finished; the original
# left them running (and `cl` is later overwritten by a second cluster).
stopCluster(cl)

class(voterlistlist)

# Drop NULL zip-level entries, then rbind each zip's per-defendant results
# (unzip one layer of lists) and finally rbind across zips. rbind ignores NULL
# elements, so defendants/zips with no nearby voters simply contribute no rows.
voterlisttrim <- voterlistlist[!vapply(voterlistlist, is.null, logical(1))]
voterl3 <- lapply(voterlisttrim, function(zipresults) do.call("rbind", zipresults))
d <- do.call("rbind",voterl3)
print(Sys.time()-strt)

df1 <- d

#now merge the full voter file back in.
dim(df1)
summary(df1$distances)

# Left join: every (voter, defendant) pair found above keeps its row.
df2 <- merge(df1, harrisproj, by.x = "state_file_id", by.y = "state_file_id", all.x = TRUE)
dim(df1)
dim(harrisproj)
dim(df2)

#comment out the next 2 rows and then delete them eventually? #COMEBACK
#load("defneighbordists_20002012preelection_shortdists_apr.Rdata") #

neighborsdist <- df2
# NOTE(review): buffer() computes `distances` from columns its own comment
# describes as lat/long; if so, `meters` is not literally meters -- confirm units.
neighborsdist$meters <- neighborsdist$distances * 1000
summary(neighborsdist$distances)
summary(neighborsdist$meters)

# Keep pairs with a non-missing distance of at most .005.
allclose <- neighborsdist[!is.na(neighborsdist$distances) & neighborsdist$distances <= .005, ] # hmm.
summary(allclose$distances)
dim(allclose)
dim(neighborsdist)

#keep everyone close to a defendant with <10 voters at an address.
#take that df, merge it to the court records data (for crt asst, sentencing)
# also drop anyone (defendant) who appears to be themselves in the voter file
closeones <- allclose
length(unique(closeones$def_spn))

library(data.table)
library(sp) 
library(spdep) 
library(rgeos) 
library(maptools)
library(maps)

# Reload both defendant extracts as plain tables (the earlier copy was turned
# into a spatial object); geodef12flat is read after each load().
load("Harrisdefendants20002012_Arcgeocoded.Rdata")
harrismatch1 <- geodef12flat
dim(harrismatch1)

load("Harrisdefendants20122014_Arcgeocoded.Rdata")
# Filing year 2012 is dropped from the second extract to avoid duplication.
update <- geodef12flat[geodef12flat$fyear != 2012, ]
dim(update)
dim(geodef12flat)

harrismatch <- rbind(harrismatch1, update)
dim(harrismatch)

#trim down to Harris county addresses? using zipcode http://www.zillow.com/browse/homes/tx/harris-county/
Harriscozips <- c(63362, 77002, 77004, 77003, 77006, 77005, 77008, 77007, 77010, 77009, 77012, 77011, 75032, 77014, 77013, 77016, 63383, 77015, 77018, 77017, 77020, 77019, 77022, 77021, 77024, 77023, 77026, 77025, 77028, 77027, 77030, 77029, 77032, 77031, 77034, 77033, 77036, 77035, 77038, 77037, 77040, 77039, 77042, 77041, 77044, 77043, 77046, 77045, 77048, 77047, 77050, 77049, 77051, 77054, 77053, 77056, 77055, 77058, 28056, 77057, 77060, 77059, 77062, 77061, 77064, 77063, 77066, 77065, 77068, 77067, 77070, 77069, 76048, 77072, 77071, 77074, 77073, 77076, 77075, 77078, 77077, 77080, 77079, 77082, 77081, 77084, 77083, 77086, 77085, 77088, 77087, 77090, 75134, 77089, 77092, 77091, 77094, 78108, 77093, 77096, 77095, 77098, 77099, 77204, 63627, 75160, 77217, 77238, 77249, 77255, 75229, 77266, 77268, 76226, 77306, 77318, 77316, 77325, 76247, 77334, 78266, 77336, 77339, 77338, 77345, 76270, 77346, 77354, 33935, 77356, 77355, 77357, 77362, 77365, 77373, 77375, 77377, 77379, 77381, 77380, 77383, 77382, 77385, 80498, 77384, 77386, 77389, 77388, 77391, 77396, 77401, 76437, 77406, 77410, 77423, 77429, 77433, 77441, 76472, 77445, 77447, 77450, 77449, 77459, 76513, 77469, 77471, 77478, 77477, 77479, 77482, 77484, 77489, 77493, 77494, 77503, 77502, 77505, 77504, 36862, 77507, 77506, 77510, 77514, 77521, 77520, 77530, 77532, 77531, 77523, 77407, 77536, 77498, 77535, 77539, 79708, 77546, 77545, 77547, 77554, 77562, 77571, 77573, 77578, 77581, 77584, 77583, 77586, 77587, 77590, 77598, 77650, 77663, 75758, 78669, 79938, 77845, 78734)
dim(harrismatch)
# Same three filters as in the spatial step: zip on the Harriscozips list,
# geocoding score above 48 (drops non-matched addresses while keeping the
# cutoff low), and crt below 16 (misdemeanor cases only).
harrismatch <- harrismatch[harrismatch$ARC_ZIP %in% Harriscozips, ]
harrismatch <- harrismatch[harrismatch$Score > 48, ]
harrismatch <- harrismatch[harrismatch$crt < 16, ]
dim(harrismatch)

#clean up court records
names(harrismatch)[names(harrismatch) == "coords.x2"] <- "Latitude"
names(harrismatch)[names(harrismatch) == "coords.x1"] <- "Longitude"
harrismatch$firstcase_date <- as.Date(as.character(harrismatch$firstcased), format = "%Y%m%d")
dim(harrismatch)
length(unique(harrismatch$def_spn)) #few dozen dupes due to weird cases (like people ending up with different cases filed exact same day in two courts)-- drop them

# Drop every def_spn that occurs more than once (all copies, not just the extras).
harrismatch1a <- harrismatch[
	!duplicated(harrismatch$def_spn) & !duplicated(harrismatch$def_spn, fromLast = TRUE),
]
harrismatch1 <- as.data.table(harrismatch1a)

# Attach the court-record columns to each nearby-voter row (left join on def_spn).
closeones_m1 <- merge(closeones, harrismatch1, by.x = "def_spn", by.y = "def_spn", all.x = TRUE)
dim(closeones_m1)
dim(closeones)
dim(harrismatch1)
length(unique(harrismatch1$def_spn))

matnbors <- as.data.table(closeones_m1)
mneighbors <- matnbors[crt < 16] #misdemeanants only
dim(mneighbors)

#now drop defendants who are themselves in the voter file
library(lubridate)
# Upper-cased, whitespace-trimmed voter last name (strips leading/trailing spaces just in case).
mneighbors[, mergelname := trimws(toupper(as.character(last_name)))]
# Voter and defendant birth dates parsed with ymd() so they can be compared.
mneighbors[, mergeDOB := ymd(as.character(born_at))]
mneighbors[, defendantDOB := ymd(def_dob)]
#am looking for fname/lastname in def. name field, plus matched YOB
mneighbors[, defendantYOB := format(defendantDOB, "%Y")]
mneighbors[, voterYOB := format(mergeDOB, "%Y")]
head(mneighbors$voterYOB)

# testFunc(): is `votername` a literal (non-regex) substring of `defname`?
# Returns the logical result of grepl(); vectorised over `defname`.
testFunc <- function(votername, defname) {
	grepl(pattern = votername, x = defname, fixed = TRUE)
}
# Row by row, test whether the voter's first / last name occurs as a literal substring
# of the defendant name field (testFunc uses grepl with fixed matching).
# NOTE(review): an empty voter name would match every defendant name under fixed grepl -- confirm no blank names exist.
mneighbors$samefname <- apply(mneighbors[,c('first_name','def_nam')], 1, function(y) testFunc(y['first_name'],y['def_nam'])) #is voter first name in def_name field?
mneighbors$samelname <- apply(mneighbors[,c('last_name','def_nam')], 1, function(y) testFunc(y['last_name'],y['def_nam'])) #same for last name
# voterdef flags voter rows that appear to be the defendant themselves.
mneighbors[, voterdef:=0]
mneighbors[samefname==1 & samelname==1 & defendantYOB==voterYOB, voterdef:=1] #count as voter if matches 1st/last name and YOB.
mnbors <- mneighbors[voterdef!=1,] #drop defendants from the file.
# Difference in dimensions = number of rows removed (the column difference should be 0).
dim(mneighbors)-dim(mnbors) 

##run a quick check to see set difference: should we have dropped the people we dropped?
#setkey(mneighbors, "state_file_id")
#setkey(mnbors, "state_file_id")
#dropvoters <- mneighbors[!mnbors]
#dim(dropvoters)
#head(dropvoters) #these look correct.

#set up turnout measure 2012 (and hist) and some defendant-derived covars
# NOTE(review): male/black/voter_male are created as logical NA columns before
# 1/0 are assigned; data.table may keep the column logical -- confirm the
# stored type is what the analysis code expects.
mnbors[, male := NA]
mnbors[def_sex == "M", male := 1]
mnbors[def_sex == "F", male := 0]

mnbors[, black := NA]
mnbors[def_rac == "B", black := 1]
mnbors[!is.na(def_rac) & def_rac != "B", black := 0]

# ageatfile is compared against 10950 to split defendants into under/over 30.
mnbors[ageatfile < 10950, over30 := 0]
mnbors[ageatfile >= 10950, over30 := 1]

#oh, and clean up charge severity.
# class A or B misdemeanor: carries diff. max sentence length, indicates
# different severity (will include as a covar later)
mnbors[, classA := 0]
mnbors[mostsevcha == 5, classA := 1]

#also clean up whatever voter covars we have from the voter file.
#note that "gender" is from the voter file - it's a factor, could change if wanted.
electionday2012 <- ymd("2012-11-06")
# how old was the voter on election day 2012?
mnbors[, voter_age := (electionday2012 - mergeDOB)/365.25]
mnbors[, voter_male := NA]
mnbors[gender == "F", voter_male := 0]
mnbors[gender == "M", voter_male := 1]

# Count rows sharing each matched address, then keep addresses with fewer than 10.
mnbors[, count := 1]
mnbors[, numberatpoint := sum(count), by = Match_addr.y]
smallneighbors <- mnbors[numberatpoint < 10, ]
dim(smallneighbors)

#quick check of size at different cutoffs:
smallneighbors8 <- mnbors[numberatpoint < 8, ]
nrow(smallneighbors8)
smallneighbors12 <- mnbors[numberatpoint < 12, ]
nrow(smallneighbors12)
smallneighbors20 <- mnbors[numberatpoint < 20, ]
nrow(smallneighbors20)

# Relative change in sample size versus the <10 cutoff.
(nrow(smallneighbors) - nrow(smallneighbors8)) / nrow(smallneighbors)
(nrow(smallneighbors) - nrow(smallneighbors12)) / nrow(smallneighbors)
(nrow(smallneighbors) - nrow(smallneighbors20)) / nrow(smallneighbors)

###########################################################################
#now merge in the updated 2014 voter file to fill in vote history 2012.
load("fullTXfile2014_minimal.Rdata")
#this is the full TX file from 2014; I cut it down to just a couple of columns for this so it's not so huge to load in.
dim(fullfiletrim)
fullfiletrim[, state_file_id := as.numeric(state_file_id)]

# Inner join first, only to see how many voters drop off ...
small1 <- merge(smallneighbors, fullfiletrim, by = "state_file_id")
dim(small1)
dim(smallneighbors)
dim(fullfiletrim) #some people dropped off.

# ... then the left join that is actually kept.
small1 <- merge(smallneighbors, fullfiletrim, by = "state_file_id", all.x = TRUE)
dim(small1)
dim(smallneighbors)
dim(fullfiletrim)
smallneighbors <- small1

# Each turnout flag defaults to 0 and is set to 1 when the corresponding
# vote-history column is non-missing and greater than 0.
smallneighbors[, vote2012 := 0] #voting
smallneighbors[!is.na(vh12g1) & vh12g1 > 0, vote2012 := 1]
sum(smallneighbors$vote2012, na.rm = TRUE)

#but actually use the mid-2012 file for earlier voting, since it's more complete.
smallneighbors[, vote2010 := 0]
smallneighbors[!is.na(general_2010) & general_2010 > 0, vote2010 := 1]
sum(smallneighbors$vote2010, na.rm = TRUE)

smallneighbors[, vote2008 := 0]
smallneighbors[!is.na(general_2008) & general_2008 > 0, vote2008 := 1]
sum(smallneighbors$vote2008, na.rm = TRUE)

smallneighbors[, vote2006 := 0]
smallneighbors[!is.na(general_2006) & general_2006 > 0, vote2006 := 1]
sum(smallneighbors$vote2006, na.rm = TRUE)

smallneighbors[, vote2004 := 0]
smallneighbors[!is.na(general_2004) & general_2004 > 0, vote2004 := 1]
sum(smallneighbors$vote2004, na.rm = TRUE)

#cut this down a little at first
# firstcase: the filing date parsed by ymd().
smallneighbors[, firstcase := ymd(firstcased)]
dim(smallneighbors)
dim(smallneighbors)

smallneighborsfull <- smallneighbors

#keep best defendant match.
# allDup(): TRUE for every element of `value` that occurs more than once
# (first occurrence included), FALSE for elements that are unique.
allDup <- function(value) {
	repeated_forward <- duplicated(value)
	repeated_backward <- duplicated(value, fromLast = TRUE)
	repeated_forward | repeated_backward
}
# Flag voters that appear more than once (matched to several defendants).
smallneighborsfull[, dupe := 0]
smallneighborsfull[allDup(smallneighborsfull$state_file_id), dupe := 1]
table(smallneighborsfull$dupe) #find all duplicates

# Reverse the score so that sorting ascending puts the best match first.
smallneighborsfull[, defmatchsc := 100 - Score.y]
nondupes <- smallneighborsfull[dupe == 0, ]
dupesall <- smallneighborsfull[dupe == 1, ]

# Sort duplicated voters by id, then by reversed score, and keep the first row per voter.
setkey(dupesall, state_file_id, defmatchsc)
deduped <- dupesall[!duplicated(dupesall$state_file_id), ] #drop everything but the first (and best-matched) voter obs
smallneighborsfull <- rbind(nondupes, deduped)
dim(smallneighborsfull)

smallneighborsfull2 <- smallneighborsfull

#and mark whether or not the matched household shares a last name (for robustness test below)
smallneighborsfull2[, samename := 0]
# Literal-substring test: does `votername` occur inside `defname`?
testFunc <- function(votername, defname) {
	grepl(votername, defname, fixed = TRUE)
}
# samename: is the cleaned voter last name contained in the defendant name field?
smallneighborsfull2$samename <- apply(
	smallneighborsfull2[, c('mergelname', 'def_nam')],
	1,
	function(y) testFunc(y['mergelname'], y['def_nam'])
)
sum(smallneighborsfull2$samename)
samename <- smallneighborsfull2[samename == 1, ]
dim(samename)
dim(smallneighborsfull2)
dim(samename) / dim(smallneighborsfull2) #just under 40%

# Absolute age difference in years between defendant and voter.
smallneighborsfull2[, agediff := as.numeric(abs(defendantDOB - mergeDOB)/365.25)]
# Signed age difference in years (defendant goes first because that DOB should be "bigger"/more recent).
smallneighborsfull2[, agediff1 := as.numeric(defendantDOB - mergeDOB)/365.25]


### Next: merging in tax assessment data to find homeowners (for SI Tables SI14-SI15)
# The code below reads an object named ownacct after this load.
load("/nfs/projects_nobackup/f/FLvote/Texas/merging/harrisassessment2008_ownerspropertiesmerged.Rdata") #"ownacct"
#window <- smallneighborsfull2[(smallneighborsfull2$firstcase >= "2012-01-01" & smallneighborsfull2$firstcase <= "2013-12-31"), ]; dim(window); dim(smallneighborsfull2)
#smallneighborsfull2 <- window #just keep the dates I might reasonably use.

head(ownacct$name) #note that names don't seem reliably formatted.
## approach: look for name matches(voter first and last names both appear in the owner name field), plus restrict based on some address data (town, zip to start with)
# Numeric zip on both sides so they can be compared with ==.
ownacct$numzip <- as.numeric(ownacct$site_addr_3)
smallneighborsfull2$numzip <- as.numeric(smallneighborsfull2$ARCZIPCHAR.y)
# Upper-cased voter names used as the search patterns.
smallneighborsfull2$vlastname <- toupper(as.character(smallneighborsfull2$last_name))
smallneighborsfull2$vfirstname <- toupper(as.character(smallneighborsfull2$first_name))
Sys.setlocale('LC_ALL', 'C')

# Timer start; the elapsed time is reported after the parallel tax match.
ptm <- proc.time() #optimize a little so this doesn't take days.  
library(doParallel)
# Start 6 worker processes and register them as the %dopar% backend (this reassigns `cl`).
cl <- makeCluster(6)
registerDoParallel(cl)

# taxmatch(): find owner records that could belong to voter i.
#   i                   - row index into smallneighborsfull2
#   ownacct             - owner table with `name` and `numzip` columns
#   smallneighborsfull2 - voter table with vlastname, vfirstname, numzip, state_file_id
# An owner row is kept when grepl() finds the voter's last name and first name
# in the owner `name` field and the zip codes are equal. Matching rows get a
# `voter` column holding the voter's state_file_id; when nothing matches, the
# empty subset is returned without that column.
# NOTE(review): the names are passed to grepl() as regular-expression patterns (no fixed=TRUE).
taxmatch <- function(i, ownacct, smallneighborsfull2){
	last_hit <- grepl(smallneighborsfull2$vlastname[i], ownacct$name)
	by_last <- ownacct[last_hit, ] #is last name of voter in the name field for owners?
	first_hit <- grepl(smallneighborsfull2$vfirstname[i], by_last$name)
	by_both <- by_last[first_hit, ] #first name?
	allmatch <- by_both[by_both$numzip == smallneighborsfull2$numzip[i], ] #zip code?
	if(nrow(allmatch) > 0){
		allmatch$voter <- smallneighborsfull2$state_file_id[i] #add it to the list
	}
	allmatch
}

# Run taxmatch() for every voter row in parallel and rbind the per-voter results.
# seq_len() keeps the loop empty (instead of iterating over 1, 0) if there are no rows.
taxmatches <- foreach(i = seq_len(nrow(smallneighborsfull2)), .combine=rbind) %dopar% (taxmatch(i, ownacct, smallneighborsfull2))
proc.time()-ptm
# Shut the workers down; the original never released this cluster.
stopCluster(cl)

dim(taxmatches) #lots
length(unique(taxmatches$voter)) #but not really- just lots of duplicates for a few people.

#now I want to merge this to the main data and then trim out bad matches using additional data.
#remove duplicated/unnecessary columns
smallneighborsfull2[, nbec_guid.x := NULL]
smallneighborsfull2[, nbec_guid.y := NULL]
smallneighborsfull2[, index := 1:.N]

# ownersfile marks rows that came from the owner matches; after the left join
# it is NA for voters with no candidate owner record.
taxmatches$ownersfile <- 1
fullmerge <- merge(smallneighborsfull2, taxmatches, by.x = "state_file_id", by.y = "voter", all.x = TRUE)
dim(fullmerge)
dim(smallneighborsfull2)

unmatched <- fullmerge[is.na(fullmerge$ownersfile), ]
dim(unmatched) #most of them
matches <- fullmerge[fullmerge$ownersfile == 1, ]
dim(matches)

#one quick cut: see if the street numbers are even close
# leading digits of the assessment address vs. the defendant's street number
matches$housenum <- as.numeric(gsub("([0-9]+).*$", "\\1", matches$site_addr_1)) #pull out house numbers from assessment data
matches$def_housenum <- as.numeric(trimws(matches$def_stnum))
matches$streetnum <- ifelse(matches$housenum == matches$def_housenum, 1, 0)
sum(matches$streetnum, na.rm = TRUE)
goodmatch <- matches[matches$streetnum == 1, ]

#maybe an additional filter: fuzzy match on street name, just to make sure they're roughly similar? (though skimming through, they look quite accurate already)
# library() instead of require(): a missing package now stops the script here
# rather than returning FALSE and failing later at the stringdist call.
library(stringdist)
# Street text of the assessment address = the address with all digits removed.
goodmatch$ownstreet <- gsub("\\d", "",  goodmatch$site_addr_1)
# Pairwise Jaro-Winkler distance (method="jw", p=0) between the defendant's
# street name and the owner's street text.
goodmatch$stringdist_street <- mapply(stringdist, goodmatch$def_stnam, goodmatch$ownstreet, method="jw", p=0)
summary(goodmatch$stringdist_street)

# Manual inspection of two distance bands (`unsure` is overwritten on purpose).
unsure <- goodmatch[goodmatch$stringdist_street<.35 & goodmatch$stringdist_street>.2,]; dim(unsure); head(unsure) #how bad is bad? these all look totally fine but with typos, etc.
unsure <- goodmatch[goodmatch$stringdist_street<.5 & goodmatch$stringdist_street>.35,]; dim(unsure); head(unsure) #okay, some of these are wrong

# Final cut: keep matches whose street distance is below .45.
bestmatch <- goodmatch[goodmatch$stringdist_street<.45,]
dim(bestmatch)

# homeowner = 1 for voters whose id appears among the accepted owner matches.
smallneighborsfull2[, homeowner := 0]
smallneighborsfull2[state_file_id %in% bestmatch$state_file_id, homeowner := 1]
table(smallneighborsfull2$homeowner)
dim(bestmatch)

#finally, merge in a list of voters that have been matched to felony cases (so we can exclude them in a robustness check, given they're kind of "treated")
# The code reads an object named sn0914 after this load.
load("HHsmatched0914.Rdata") #this is generated at the bottom of "Mainmerge_spring2018_felonyversion.R"
smallneighborsfull2[, felondrop := 0]
smallneighborsfull2[state_file_id %in% sn0914$state_file_id, felondrop := 1]


#######################################################################
## generate the de-identified dataset to be used for the main analysis
#######################################################################

## need to keep def. identifier for clustering, but don't need it to be actual court system def_spn-- hash it
library(digest)
vdigest <- Vectorize(digest)
# Vectorize() is built on mapply(USE.NAMES = TRUE), which names the result
# after the input when the input is a character vector (or already has names).
# unname() guarantees the hashed column cannot carry the raw def_spn values
# along as a names attribute; the hash values themselves are unchanged.
# NOTE(review): digest() is applied without a salt -- confirm that is
# acceptable for the identifier space being hashed.
smallneighborsfull2$hashedID <- unname(vdigest(smallneighborsfull2$def_spn))

colnames(smallneighborsfull2)
# Columns released in the de-identified file.
keeps <- c("residential_zip5", "def_rac", "def_sex", "ageatfile", "fyear", "firstcase", "totalsente", "anyjail", "anyconv", "anyfine", "anyprobati", "mostsevcha", "male", "black", "over30", "classA", "voter_age", "voter_male", "numberatpoint", "vote2012", "vote2008", "vote2004","vote2010", "vote2006", "samename", "hashedID", "agediff", "agediff1", "homeowner", "felondrop") 

deid <-  subset(smallneighborsfull2, select=keeps)
setnames(deid, "hashedID", "def_spn") #rename the hashed version to the original column name so all the original code works. 
# Keep the full-column table under another name; the trimmed table takes over the original name.
smallneighborsallcols <- smallneighborsfull2
smallneighborsfull2 <- deid

save(smallneighborsfull2, file="main_householdsmerged_deidentified.Rdata") #this is the main dataset to be used for analysis.

#Note about felony cases: the above code can be modified slightly to generate the same dataset but for people with proximal contact with felony cases, simply by changing the courtrooms included in the analysis (and changing the resulting filename).  This happens in "Mainmerge_spring2018_felonyversion.R"

#####################################################################################################
## next, generate a couple of weird one-off merged/deidentified datasets for supplementary analyses. 

#Start with: a copy of the full voter file with indicators for various proximal contact treatments (this lets us compare to the full registered voter population and do our "naive OLS" for the SI)

#this is everybody after 2008 election, including ones after the 2012 election.
sn0914 <- smallneighborsallcols[firstcase > "2008-11-04", ]

load("Harrisvoters2013_Arcgeocoded.Rdata") #base voter file
# Voters in the base file whose id is not among the treated households.
unmatched <- fullfile[!(fullfile$state_file_id %in% sn0914$state_file_id), ]
dim(unmatched)
dim(fullfile)

load("fullTXfile2014_minimal.Rdata") #updated 2014 file, for 2012 vote.
fullfiletrim[, state_file_id := as.numeric(state_file_id)]
# Left join so every unmatched voter keeps a row.
small1 <- merge(unmatched, fullfiletrim, by = "state_file_id", all.x = TRUE)
dim(small1)
dim(unmatched)
dim(fullfiletrim)

# Turnout flags: default 0, set to 1 when the history column is non-missing and > 0.
small1[, vote2012 := 0] #now figure out 2012 voting
small1[!is.na(vh12g1) & vh12g1 > 0, vote2012 := 1]
sum(small1$vote2012, na.rm = TRUE)

#plus clean up past voting (from the first file) and other voter covars.
small1[, vote2010 := 0]
small1[!is.na(general_2010) & general_2010 > 0, vote2010 := 1]
small1[, vote2008 := 0]
small1[!is.na(general_2008) & general_2008 > 0, vote2008 := 1]
small1[, vote2006 := 0]
small1[!is.na(general_2006) & general_2006 > 0, vote2006 := 1]
small1[, vote2004 := 0]
small1[!is.na(general_2004) & general_2004 > 0, vote2004 := 1]

small1[, mergeDOB := ymd(as.character(born_at))]
electionday2012 <- ymd("2012-11-06")
#how old was the voter on election day 2012?
small1[, voter_age := ((electionday2012 - mergeDOB)/365.25)]
small1[, voter_male := NA]
small1[gender == "F", voter_male := 0]
small1[gender == "M", voter_male := 1]

#now keep needed columns from each and stick them together.
# Treated households keep the case columns; the untreated voters only have the voter columns.
matched_trim <- subset(
	sn0914,
	select = c("vote2012","vote2010","vote2008","vote2006","vote2004","voter_male","voter_age", "registered_at",  "firstcase", "anyconv", "anyprobati", "anyjail")
)
matched_trim[, treated := 1]
unmatched_trim <- subset(
	small1,
	select = c("vote2012","vote2010","vote2008","vote2006","vote2004","voter_male","voter_age", "registered_at")
)
unmatched_trim[, treated := 0]

dim(matched_trim)
dim(unmatched_trim)
# fill=TRUE: columns missing from the unmatched table (e.g. firstcase) become NA.
wholefile <- rbind(matched_trim, unmatched_trim, fill = TRUE)
dim(wholefile)

colnames(wholefile)

save(wholefile, file="households_plusfullfile_deidentified.Rdata")



#and now also do a plot of just the count of cases (ignoring matching-- do we see over-time variation here?). for this use "harrismatch"-- defendants from before the match.
checkweeks <- c(-52:52)
# One row per week offset from election day: `n` is the number of harrismatch
# rows whose firstcase_date falls in the 7-day window [lower, higher).
# Built in one step instead of filling a pre-made NA frame inside a 1:nrow()
# loop; this also stops the loop temporaries (e.g. a global `window`, which
# masks stats::window) from being left behind.
storage <- data.frame(
	week = checkweeks,
	n = vapply(checkweeks, function(wk) {
		lower <- electionday2012 + wk*7 #add/subtract off days to get the window
		higher <- lower + 7
		nrow(harrismatch[harrismatch$firstcase_date < higher & harrismatch$firstcase_date >= lower, ])
	}, integer(1))
)

pdf("beforeafter_checkcasecounts.pdf")
plot(storage$week, storage$n, main= "Number of First-Time Misdemeanor Cases",ylim=c(0,420), xlab="Charge Date: Weeks From Election", ylab="Number of Defendants", cex.main=1.5, cex.axis=1.5,cex=1.5, cex.lab=1.45)
abline(v=0, lty=2) # election week
# loess smooth of the weekly counts, drawn in week order
match.lo <- loess(storage$n ~ storage$week)
j <- order(storage$week)
lines(storage$week[j],match.lo$fitted[j],col="red",lwd=3)
dev.off() #Figure SI5

#and how many first-time misd. defs were there? (matched to voters)
colnames(harrismatch)
dim(harrismatch)
# Rows whose filing date falls within calendar year 2012.
h12 <- harrismatch[harrismatch$firstcase_date >= "2012-01-01" & harrismatch$firstcase_date <= "2012-12-31", ]
dim(h12)



